Introduction
This report assesses the following in regards to the provided bibliography:
Remarks: - Group ID is in the json, but not in the csv. The group ID makes it possible to directly jump to the reference in the Zotero Library online.
Bibliography Setup
The bibliography is loaded and the DOIs, ISBNs and ISSNs are extracted. In a second step, the corresponding works are downloaded from [OpenAlex](https://openalex.org).
Show the code
#|
# Load the bibliography export; `params$bibliography` holds the path to the CSV.
bib <- read.csv(params$bibliography)

# Prefixes removed from the DOI column so that only the bare DOI name remains.
# They are applied one after the other, in this order.
doi_prefixes <- c(
  "^https://doi.org/",
  "^https://dx.doi.org/",
  "^https://hdl.handle.net/",
  "^http://doi.org/",
  "^http://dx.doi.org/",
  "^http://hdl.handle.net/",
  "^doi:",
  "^DOI "
)

# DOIs, named by the bibliography key; entries without a DOI are dropped.
dois <- bib$DOI
names(dois) <- bib$Key
dois <- dois[dois != ""]
for (prefix in doi_prefixes) {
  dois <- gsub(pattern = prefix, replacement = "", x = dois)
}

# ISBNs, named by the bibliography key; entries without an ISBN are dropped.
isbns <- bib$ISBN
names(isbns) <- bib$Key
isbns <- isbns[isbns != ""]

# ISSNs, named by the bibliography key; entries without an ISSN are dropped.
# These must come from the ISSN column: reading the DOI column here would make
# every ISSN statistic a copy of the DOI statistic.
issns <- bib$ISSN
names(issns) <- bib$Key
issns <- issns[issns != ""]
In a second step, we retrieve the works from OpenAlex based on the DOIs.
Show the code
#|
# The works retrieved from OpenAlex are cached in this file; when it exists it
# is read instead of querying OpenAlex again.
fn <- file.path(".", "data", "bib_works.rds")
if (file.exists(fn)) {
  bib_works <- readRDS(fn)
} else {
  # Query the unique DOIs in chunks of at most 199 DOIs per request.
  unique_dois <- unique(dois)
  doi_chunks <- split(unique_dois, ceiling(seq_along(unique_dois) / 199))
  bib_works <- pbmcapply::pbmclapply(
    seq_along(doi_chunks),
    function(i) {
      oa_fetch(
        entity = "works",
        doi = doi_chunks[[i]],
        per_page = 200,
        verbose = FALSE
      )
    },
    mc.cores = params$mc.cores
  ) |>
    do.call(what = rbind)
  # Make sure the cache directory exists; saveRDS() does not create it.
  dir.create(dirname(fn), showWarnings = FALSE, recursive = TRUE)
  saveRDS(bib_works, fn)
}

# Strip URL / label prefixes from the DOIs returned by OpenAlex so that they
# can be compared with the DOIs of the bibliography. The prefixes are applied
# one after the other, in this order.
doi_prefixes <- c(
  "^https://doi.org/",
  "^https://dx.doi.org/",
  "^https://hdl.handle.net/",
  "^http://doi.org/",
  "^http://dx.doi.org/",
  "^http://hdl.handle.net/",
  "^doi:",
  "^DOI "
)
dois_oa <- bib_works$doi
for (prefix in doi_prefixes) {
  dois_oa <- gsub(pattern = prefix, replacement = "", x = dois_oa)
}
Data Quality of the Bibliography
Cleanliness of bibliography
One measure of the cleanliness of a Bibliography is assessed by checking the number of references that have a DOI. The following table gives an overview over some numbers regarding the DOIs, ISBNs and ISSNs in the bibliography.
Entries with DOIs, ISBNs or ISSNs
To identify a reference, the most widely used identifier is the DOI. The following table shows the number of references with a DOI and the number of unique DOIs.
Treating duplicate ISBNs or ISSNs as duplicate entries in the library is not warranted, as e.g. different chapters of a book can be separate entries in the library and therefore lead to duplicates.
DOIs : 3863 (73.8482126%) - 2 duplicates
ISBNs : 453 (8.6599121%) - 57 duplicates
ISSNs : 3863 (73.8482126%) - 2 duplicates
The following DOIs are duplicates in the bibliography. This table should be empty.
Show the code
# duplicate_isbns <- paste0("https://isbnsearch.org/search?s=", dois[duplicated(isbns)])
# duplicate_issns <- paste0("", dois[duplicated(issns)])

# Table of the DOIs that occur more than once, each rendered as a link to
# doi.org. Evaluated in a local scope so that no helper variables remain.
local({
  repeated <- dois[duplicated(dois)]
  links <- sprintf('<a href="https://doi.org/%s" target="_blank">%s</a>', repeated, repeated)
  knitr::kable(
    data.frame(Type = "doi", Identifier = links),
    caption = "Duplicate DOIs in the Bibliography",
    escape = FALSE
  )
})
DOIs in Open Alex
To validate the existence and validity of the DOIs, we check if the DOIs are in the OpenAlex database.
Show the code
# Unique DOIs of the bibliography that were not returned by OpenAlex.
# DOI names are case-insensitive, so both sides are compared in lower case;
# the reported DOIs keep the spelling used in the bibliography.
dois_not_in_oa <- unique(dois)
dois_not_in_oa <- dois_not_in_oa[!(tolower(dois_not_in_oa) %in% tolower(dois_oa))]

# Subset of the DOIs above for which IPBES.R::doi_valid() returns TRUE.
dois_valid <- dois_not_in_oa[IPBES.R::doi_valid(dois_not_in_oa)]
Of the 3861 unique DOIs in the library, 471 (12.1989122%) are not in OpenAlex.
Show the code
# Interactive table of the DOIs that are not in OpenAlex, each rendered as a
# link to doi.org. The caption describes the content of the table instead of
# the former placeholder text.
data.frame(
  Type = "doi",
  Identifier = sprintf('<a href="https://doi.org/%s" target="_blank">%s</a>', dois_not_in_oa, dois_not_in_oa)
) |>
  IPBES.R::table_dt(caption = "DOIs in the Bibliography which are not in OpenAlex")
Of these 3 are not valid. These are:
Show the code
# Table of the DOIs that are not in OpenAlex and are not among the valid ones,
# each rendered as a link to doi.org. Evaluated in a local scope so that no
# helper variables remain.
local({
  invalid <- dois_not_in_oa[!(dois_not_in_oa %in% dois_valid)]
  links <- sprintf('<a href="https://doi.org/%s" target="_blank">%s</a>', invalid, invalid)
  knitr::kable(
    data.frame(Type = "doi", Identifier = links),
    caption = "Non Valid DOIs in the Bibliography",
    escape = FALSE
  )
})
TODO Finally, we check whether these DOIs exist but have not been ingested into OpenAlex. This is done using the doi.org resolver. This is disabled at the moment.
Show the code
# Check the valid DOIs with IPBES.R::doi_exists(); the cache file path is
# passed on to that function.
dois_exist <- dois_valid |>
  IPBES.R::doi_exists(cache_file = file.path(".", "cache", "doi_exist.rds"))
Show the code
# DOIs of the bibliography which were not returned by OpenAlex. DOI names are
# case-insensitive, so the comparison is done in lower case.
to_check <- dois[!(tolower(dois) %in% tolower(dois_oa))]
dois_valid <- IPBES.R::doi_valid(dois)
# For each DOI of the bibliography: was it returned by OpenAlex?
dois_openalex <- tolower(dois) %in% tolower(dois_oa)
names(dois_openalex) <- dois
dois_exist <- IPBES.R::doi_exists(to_check, cache_file = file.path(".", "cache", "doi_exist.rds"))
dois_not_retracted <- IPBES.R::doi_not_retracted(dois, cache_file = file.path(".", "cache", "doi_not_retracted.rds"))

# Number of DOIs which repeat an earlier DOI of the bibliography.
n_duplicate_dois <- length(dois) - length(unique(dois))

# Print a plain-text summary of the DOI statistics.
sprintf(
  fmt = paste(
    "Number of references: \t\t %d",
    "Number of DOIs: \t\t %d",
    "Number of Duplicate DOIs: \t %d",
    # a literal percent sign has to be written as `%%` in a sprintf() format
    "Number of DOIs in OpenAlex: \t %d ( %f %%)",
    "Number of Existing DOIs: \t %d",
    "Number of Retracted DOIs: \t %d",
    "Percentage of Duplicate DOIs: \t %f",
    sep = " \n "
  ),
  nrow(bib),
  sum(!is.na(dois)),
  n_duplicate_dois,
  sum(dois_openalex), 100 * sum(dois_openalex) / nrow(bib),
  sum(dois_exist),
  sum(!dois_not_retracted),
  # share of duplicate DOIs among all DOIs, in percent
  round(n_duplicate_dois / length(dois), digits = 3) * 100
) |>
  cat()
Show the code
# Show NA cells as empty cells; the previous option value is restored below.
oldopts <- options(knitr.kable.NA = "")

# Number of DOIs which repeat an earlier DOI of the bibliography.
n_duplicate_dois <- length(dois) - length(unique(dois))

# Overview table; the rows with a bold label are section headers and have no
# value (NA).
data.frame(
  Measure = c(
    "# References",
    "**DOI**",
    "# DOIs",
    "# Duplicate DOIs",
    "# Existing DOIs",
    "# Retracted DOIs",
    "% Duplicate DOIs",
    "**ISBN**",
    "# ISBNs",
    "# Duplicate ISBNs",
    "**ISSN**",
    "# ISSNs",
    "# Duplicate ISSNs"
  ),
  Value = c(
    nrow(bib),
    NA,
    sum(!is.na(dois)),
    n_duplicate_dois,
    sum(dois_exist),
    sum(!dois_not_retracted),
    # share of duplicate DOIs among all DOIs, in percent
    round(n_duplicate_dois / length(dois), digits = 3) * 100,
    NA,
    sum(!is.na(isbns)),
    length(isbns) - length(unique(isbns)),
    NA,
    sum(!is.na(issns)),
    length(issns) - length(unique(issns))
  )
) |>
  knitr::kable(
    caption = "Cleanliness of the Bibliography"
  )
options(oldopts)
Contentual and Bibliographic analysis
Publication types
Show the code
# Number of references per item type, most frequent type first, as a table.
knitr::kable(
  dplyr::arrange(
    dplyr::summarize(
      dplyr::group_by(bib, Item.Type),
      count = n()
    ),
    desc(count)
  )
)
journalArticle
4157
bookSection
305
book
244
report
236
webpage
97
conferencePaper
83
document
67
thesis
15
dataset
12
blogPost
9
preprint
2
magazineArticle
1
newspaperArticle
1
presentation
1
videoRecording
1
Year of Publication
Show the code
# Number of references per publication year and item type in the bibliography,
# together with cumulative values per item type over the years.
data_bib <- bib |>
  dplyr::group_by(
    Publication.Year,
    Item.Type
  ) |>
  dplyr::summarize(
    count = dplyr::n(),
    .groups = "drop"
  ) |>
  dplyr::arrange(
    Publication.Year
  ) |>
  dplyr::group_by(
    Item.Type
  ) |>
  dplyr::mutate(
    # Share of the references of this type that fall into this year. It has to
    # be computed per type across all years: inside the per-(year, type)
    # summary every group has a single row, so the ratio would always be 1.
    p = count / sum(count),
    count_cumsum = cumsum(count),
    p_cumsum = cumsum(p)
  ) |>
  dplyr::ungroup() |>
  dplyr::rename(
    publication_year = Publication.Year,
    type = Item.Type
  ) |>
  dplyr::mutate(
    # relabel "journalArticle" as "article"; the result is joined on `type`
    type = dplyr::case_match(
      type,
      "journalArticle" ~ "article",
      .default = type
    )
  )
`summarise()` has grouped output by 'Publication.Year'. You can override using
the `.groups` argument.
Show the code
# Number of works per publication year and type in the works retrieved from
# OpenAlex, together with cumulative values per type over the years. The
# value columns carry an `_oa` marker.
data_works <- bib_works |>
  dplyr::group_by(
    publication_year,
    type
  ) |>
  dplyr::summarize(
    count = dplyr::n(),
    .groups = "drop"
  ) |>
  dplyr::arrange(
    publication_year
  ) |>
  dplyr::group_by(
    type
  ) |>
  dplyr::mutate(
    # Share of the works of this type that fall into this year. It has to be
    # computed per type across all years: inside the per-(year, type) summary
    # every group has a single row, so the ratio would always be 1.
    p = count / sum(count),
    count_cumsum = cumsum(count),
    p_cumsum = cumsum(p)
  ) |>
  dplyr::ungroup() |>
  dplyr::rename(
    count_oa = count,
    p_oa = p,
    count_oa_cumsum = count_cumsum,
    p_oa_cumsum = p_cumsum
  )
`summarise()` has grouped output by 'publication_year'. You can override using
the `.groups` argument.
Show the code
# Combine the per-year / per-type numbers of the bibliography and of OpenAlex.
data <- dplyr::full_join(
  x = data_bib,
  y = data_works,
  by = c("publication_year", "type")
)
rm(data_bib, data_works)

# Cumulative number of references over time per type, from 1950 onwards.
# Solid lines: bibliography (Zotero); dashed lines: OpenAlex.
data |>
  dplyr::filter(publication_year >= 1950) |>
  ggplot() +
  # the lines are mapped to `colour`, so the palette and the legend title have
  # to be set on the colour scale (a fill scale would have no effect)
  scale_colour_viridis_d(option = "plasma") +
  geom_line(aes(x = publication_year, y = count_cumsum / 10, colour = type), linetype = "solid") + # Zotero
  geom_line(aes(x = publication_year, y = count_oa_cumsum / 10, colour = type), linetype = "dashed") + # OpenAlex
  scale_x_continuous(
    breaks = seq(1500, 2020, 10)
  ) +
  scale_y_continuous(
    # the plotted values are the cumulative counts divided by 10
    "Cumulative number of references / 10",
    # multiply by 10 to show the undivided counts on the secondary axis
    sec.axis = sec_axis(~ . * 10, name = "Cumulative number of references")
  ) +
  labs(
    title = "Publications over time",
    x = "Year"
  ) +
  theme_minimal() +
  theme(axis.text.y.right = element_text(color = "red")) +
  theme(legend.position = "bottom") +
  guides(
    colour = guide_legend(
      title = "Legend"
    )
  )
Warning: Removed 11 rows containing missing values or values outside the scale range
(`geom_line()`).
Warning: Removed 198 rows containing missing values or values outside the scale range
(`geom_line()`).
Access Status of References
This is checked using the works retrieved from OpenAlex. Therefore, it is limited to the works that are on OpenAlex. At the moment, only references with a DOI were retrieved from OpenAlex.
Show the code
# Colours per access status. The vector is named so that each status keeps its
# colour regardless of which statuses occur in the data; a status that is not
# listed here is drawn in the `na.value` colour instead of raising an error.
# NOTE(review): "diamond" is presumably a newer OpenAlex status - confirm
# against the OpenAlex documentation and adjust the colour as desired.
oa_status_colours <- c(
  bronze = "#CD7F32",
  closed = "red",
  gold = "gold",
  green = "green",
  hybrid = "pink",
  diamond = "lightblue"
)

# Share of the access statuses per publication year (bars scaled to 1).
bib_works |>
  ggplot(
    aes(
      x = publication_year,
      fill = oa_status
    )
  ) +
  geom_bar(
    position = "fill"
  ) +
  scale_fill_manual(values = oa_status_colours, na.value = "grey50") +
  ggtitle("Publication Year") +
  theme(
    plot.title = element_text(size = 15)
  ) +
  theme(legend.position = "bottom")
50 Most often cited Journals
Show the code
# Number of references per publication title, most frequent first.
# Entries without a publication title are dropped: otherwise all of them would
# be counted together as one "journal" with an empty name.
# NOTE(review): all item types are counted here, not only journal articles -
# confirm whether the analysis should be restricted to journal articles.
data <- bib |>
  dplyr::filter(
    Publication.Title != ""
  ) |>
  dplyr::group_by(
    Publication.Title
  ) |>
  dplyr::summarise(
    count = dplyr::n()
  ) |>
  dplyr::rename(
    Journal = Publication.Title
  ) |>
  dplyr::arrange(
    dplyr::desc(count)
  )

# Horizontal bar chart of the 50 publication titles with the highest counts.
data |>
  dplyr::slice_max(
    count,
    n = 50
  ) |>
  ggplot(
    aes(
      x = reorder(Journal, -count),
      y = count
    )
  ) +
  geom_bar(
    stat = "identity",
    fill = "steelblue"
  ) +
  coord_flip() +
  labs(
    x = "Journal",
    y = "Count"
  ) +
  ggtitle(
    "Most often cited Journals"
  ) +
  theme(
    plot.title = element_text(size = 15)
  )
This table contains all Journals as specified in the Zotero database.
Show the code
# Table of the journal counts, created with IPBES.R::table_dt().
data |>
  IPBES.R::table_dt("cited_journals")
TODO Countries of Institutes of all authors
sapply(bib_works$author, function(x){x["institution_country_code"]}) |> unlist() |> table() |> sort()
Show the code
# Count the institution country codes over the authors of all works and keep
# the 50 most frequent ones. The country code is read from the `author` list
# column of `bib_works`; it is not grouped on `bib_works` directly, and
# entries of that column which are not data frames are skipped.
top_countries <- lapply(bib_works$author, function(x) {
  if (is.data.frame(x)) {
    x[["institution_country_code"]]
  } else {
    NULL
  }
}) |>
  unlist(use.names = FALSE) |>
  table() |>
  sort(decreasing = TRUE) |>
  head(50)

# Horizontal bar chart of the counts per country code.
data.frame(
  Country = names(top_countries),
  Count = as.numeric(top_countries)
) |>
  ggplot(
    aes(
      x = reorder(Country, -Count),
      y = Count
    )
  ) +
  geom_bar(
    stat = "identity",
    fill = "steelblue"
  ) +
  coord_flip() +
  labs(
    x = "Country",
    y = "Count"
  ) +
  ggtitle(
    "Countries of Institutes of all Authors"
  ) +
  theme(
    plot.title = element_text(size = 15)
  )
TODO ILK References in Bibliography
Show the code
#|
# Unique, non-empty values of an identifier column.
unique_non_empty <- function(x) {
  unique(x[x != ""])
}

# Unique identifiers of the ILK references.
# NOTE(review): the DOIs are selected by indexing with `dois_valid`, unlike
# the non-empty filter used for the other identifiers - confirm this is
# intended.
ilk_id <- list(
  doi = unique(ilk$DOI[dois_valid]),
  isbn = unique_non_empty(ilk$ISBN),
  issn = unique_non_empty(ilk$ISSN)
)
# ilk_snowball_id <- list(
# doi = ilk_snowball$nodes$doi[ilk_snowball$nodes$doi != ""] |>
# unique()
# )

# Values Assessment bibliography and its unique identifiers.
va <- read.csv(file.path("input", "./Values Assessment.csv"))
va_id <- list(
  doi = unique_non_empty(va$DOI),
  isbn = unique_non_empty(va$ISBN),
  issn = unique_non_empty(va$ISSN)
)
The ILK database contains nrow(ilk) references, while the Values Assessment bibliography contains nrow(va) references.
We now determine the overlap of these two by comparing the DOIs.
Show the code
#|
# both_id <- list(
# doi = va_id$doi[(va_id$doi %in% ilk_id$doi)],
# isbn = va_id$isbn[(va_id$isbn %in% ilk_id$isbn)],
# issn = va_id$issn[(va_id$issn %in% ilk_id$issn)] # ,
# # doi_snowball = va_id$doi[(va_id$doi %in% gsub("https://doi.org/", "", ilk_snowball_id$doi))]
# )
# count <- sapply(both_id, length)
# va_ref <- sapply(va_id, length)
# va_ref <- c(va_ref, doi_snowball = va_ref[[1]])
# data.frame(
# ref_ILK = length(),
# va_ref = va_ref,
# p = round(count / va_ref, digits = 3)
# ) |>
# knitr::kable(
# caption = "Overlap of the ILK and the Values Assessment Bibliography",
# col.names = c("ILK ref in VA", "n VA ref", "p")
# )

# Sizes of the ILK and the Values Assessment bibliographies and the number of
# ILK identifiers which also occur in the Values Assessment bibliography.
data.frame(
  No.ILK.Ref = nrow(ilk),
  No.Valid.DOIs = length(dois_valid),
  No.VA.Ref = nrow(va),
  No.DOI.both = sum(dois_valid %in% va$DOI),
  No.ISBN.both = sum((ilk$ISBN[ilk$ISBN != ""]) %in% va$ISBN),
  # ISSNs have to be matched against the ISSN column, not the ISBN column
  No.ISSN.both = sum((ilk$ISSN[ilk$ISSN != ""]) %in% va$ISSN)
) |>
  knitr::kable(
    caption = "Number of references in ILK Database"
  )
TODO TO BE REPLACED WITH TOPICS Assessment of OpenAlex Concepts
Show the code
#|
# Stack the elements of the `concepts` entry of `ilk_works` with rbind().
concepts <- ilk_works[["concepts"]] |>
  do.call(what = rbind)

# Per concept name and level: mean score (scaled by 100 and rounded before
# averaging) and number of rows, sorted by descending number of rows.
concepts_summary <- concepts |>
  # filter(level == 1) |>
  mutate(score = round(100 * score)) |>
  group_by(display_name, level) |>
  summarize(
    score = mean(score),
    count = n()
  ) |>
  arrange(desc(count))
Show the code
#|
# Network plot of `ilk_snowball`, laid out with the "stress" layout.
ggraph(graph = as_tbl_graph(ilk_snowball), layout = "stress") +
  # edges, with alpha mapped to the computed `index`
  geom_edge_link(aes(alpha = after_stat(index)), show.legend = FALSE) +
  # nodes: fill by `oa_input`, size by `cited_by_count`
  geom_node_point(aes(fill = oa_input, size = cited_by_count), shape = 21, color = "white") +
  # labels (the node id) only for nodes selected by `oa_input`
  geom_node_label(aes(filter = oa_input, label = id), nudge_y = 0.2, size = 3) +
  scale_fill_manual(values = c("#a3ad62", "#d46780"), na.value = "grey", name = "") +
  scale_size(range = c(3, 10), guide = "none") +
  scale_edge_width(range = c(0.1, 1.5), guide = "none") +
  guides(fill = "none") +
  theme_graph() +
  theme(
    legend.position = "bottom",
    panel.background = element_rect(fill = "transparent", colour = NA),
    plot.background = element_rect(fill = "transparent", colour = NA)
  )